
pacman::p_load(tidyverse,
               fst,
               tidymodels,
               smbinning,
               doParallel,
               embed,
               tictoc,
               lubridate,
               glmnet,
               glmnetUtils,
               readxl,
               furrr
)

# Directory that receives one .rds file per binned variable (written by the
# smbinning loop further down). SPECIAL_SUFFIX, TRAINING_SUFFIX,
# RAND_NO_CID_SMALLEST_LARGEST and CONSUMER_GROUP are not defined in this part
# of the script; they must already exist when it runs.
path_smbinning <- paste0("../../data/pipeline_outputs/", 
                         SPECIAL_SUFFIX, "/", "smbinning_", TRAINING_SUFFIX,
                         "_", RAND_NO_CID_SMALLEST_LARGEST, "/")

# "Larry" variables: ~450 CCP variables that were manually checked for
# credit-scoring appropriateness.
v_larry_cc_cd <- read_xlsx("../../data/larry_cc_cd_vars.xlsx") %>% 
  pull(var) %>% 
  unique()

# Blank spreadsheet cells come back as NA, which is not a valid column name.
v_larry_cc_cd <- v_larry_cc_cd[!is.na(v_larry_cc_cd)]

# read_fst() expects a plain character vector of column names, so the
# variable list is passed directly: all_of() is a tidyselect helper and is only
# meant to be used inside selecting verbs such as select(). unique() keeps a
# column from being requested twice should the list overlap the fixed columns.
df_training_raw <- read_fst(paste0("../../data/pipeline_outputs/", 
                                   SPECIAL_SUFFIX, "/", "train_",
                                   TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
                                   "training.fst"),
                            columns = unique(c("cid", "qtr", "t_default", "riskscore", "rand_no_cid", 
                                               v_larry_cc_cd, "consumer_category"))) %>% 
  filter(consumer_category %in% CONSUMER_GROUP)

df_training_outcome <- df_training_raw %>% 
  mutate(
    year = year(qtr),
    # Packs year and outcome into one number (year 2015 with t_default 1
    # becomes 20151). It must be computed while t_default is still numeric,
    # i.e. before the factor conversion on the next line.
    year_outcome = (10 * year) + t_default,
    t_default = factor(t_default, levels = c(0, 1))
  ) 

# Denominator for the "share of observations" screens below.
num_obs_ <- nrow(df_training_outcome)

# Return the names of variables that are NOT dominated by a single value.
#
# Considers the columns of `data` that are of the requested type and whose name
# contains `name_part`, and keeps those whose most frequent value (NA counts as
# a value) covers less than `removal_threshold` of `n_obs`.
#
# Arguments:
#   removal_threshold    Share of observations (0-1). A variable is dropped when
#                        its most frequent value reaches this share.
#   numeric_or_character "numeric" or "character": which column type to screen.
#   data                 Data frame to screen; defaults to the script-level
#                        df_training_outcome.
#   n_obs                Denominator of the share; defaults to nrow(data).
#   name_part            Literal, case-insensitive text a column name must
#                        contain to be considered.
#
# Returns a character vector of column names, each listed once, sorted in
# C-locale order.
Remove_Single_Valued_Variable <- function(removal_threshold,
                                          numeric_or_character,
                                          data = df_training_outcome,
                                          n_obs = nrow(data),
                                          name_part = "VARNAMEPART") {
  
  # Fail with a clear message on anything other than the two supported types.
  numeric_or_character <- match.arg(numeric_or_character, c("numeric", "character"))
  
  variable_type <- switch(numeric_or_character,
                          "numeric" = is.numeric,
                          "character" = is.character
  )
  
  v_names <- names(data)
  is_candidate <- vapply(data, variable_type, logical(1)) &
    grepl(tolower(name_part), tolower(v_names), fixed = TRUE)
  v_candidates <- v_names[is_candidate]
  
  # Share of observations taken by the single most frequent value, computed
  # one column at a time. Working per column yields exactly one number per
  # variable, so a tie for the top frequency cannot list a variable twice, and
  # the table never has to be reshaped into long format.
  top_share <- vapply(v_candidates, function(nm) {
    col_ <- data[[nm]]
    if (length(col_) == 0) {
      return(NA_real_)
    }
    # match() against the unique values gives each observation the index of
    # its value (NA matches NA); tabulate() then counts occurrences per value.
    max(tabulate(match(col_, unique(col_)))) / n_obs
  }, numeric(1))
  
  keep <- !is.na(top_share) & top_share < removal_threshold
  sort(v_candidates[keep], method = "radix")
  
}

# Variables that survive the dominant-value screen (most frequent value covers
# less than 99% of observations), split by storage type. unique() guarantees
# each name is listed once, which the selections and checks below rely on.
v_numeric <- unique(Remove_Single_Valued_Variable(removal_threshold = .99, numeric_or_character = "numeric"))
v_categorical <- unique(Remove_Single_Valued_Variable(removal_threshold = .99, numeric_or_character = "character"))



df_numeric <- df_training_outcome %>% 
  select(all_of(v_numeric))

# Numeric variables with fewer than 5 distinct values (NA counts as a value)
# are dummied directly instead of being binned first. The count is taken per
# column, which avoids reshaping the whole table into long format.
v_var_less_than_5_unique <- names(df_numeric)[
  vapply(df_numeric, function(col_) n_distinct(col_) < 5, logical(1))
]


v_var_5_or_more_unique <- setdiff(names(df_numeric), v_var_less_than_5_unique)

v_concat_dummies <- c(v_categorical, v_var_less_than_5_unique)
v_union_dummies <- union(v_var_less_than_5_unique, v_categorical)

# Test the overlap itself: comparing lengths of the concatenation and the
# union would also trip on a name repeated within one of the two vectors.
stopifnot("There should not be overlap between categorical and numeric vars." =  
            length(intersect(v_categorical, v_var_less_than_5_unique)) == 0)

v_vars_to_dummy <- v_union_dummies
v_vars_to_discretize_and_dummy <- v_var_5_or_more_unique

df_selected_vars <- df_training_outcome %>% 
  select(cid, qtr, rand_no_cid, year_outcome, t_default, all_of(c(v_vars_to_dummy, v_vars_to_discretize_and_dummy)))




df_training <- df_selected_vars %>% 
  mutate(
    # Dummy variables become factors, with missing values turned into their
    # own explicit "missing" level.
    across(.cols = all_of(v_vars_to_dummy), 
           ~ fct_explicit_na(as.factor(.), na_level = "missing")),
    # Variables that will be binned get the sentinel -999999 in place of NA.
    across(.cols = all_of(v_vars_to_discretize_and_dummy), ~ ifelse(is.na(.), -999999, .))
  ) 

# Same data with the outcome converted back from factor to numeric 0/1.
df_smbinning <- df_training %>% 
  mutate(
    t_default = as.numeric(as.character(t_default))
  ) 


# Numeric 0/1 outcome as a bare vector, used to build the per-variable frames
# handed to smbinning().
v_t_default <- as.numeric(as.character(df_training$t_default))


# Smbinning parallel loop -------------------------------------------------------

# Build the two-column data frame smbinning() is called with: the numeric
# outcome as "t_default" and one predictor under its own name.
# as.data.frame() is used rather than data.frame() because the latter rewrites
# non-syntactic column names, after which `x = name` would no longer match.
Build_Binning_Frame <- function(outcome, values, name) {
  tibble(
    t_default = outcome,
    value = values
  ) %>% 
    set_names(nm = c("t_default", name)) %>% 
    as.data.frame()
}

# saveRDS() does not create missing directories.
dir.create(path_smbinning, recursive = TRUE, showWarnings = FALSE)

# NOTE(review): the multisession plan stays active after this section; reset it
# with plan(sequential) if nothing later in the script needs the workers.
plan(multisession, workers = 4)

tic()
# One smbinning() fit per variable; each result is written to
# <path_smbinning>/<variable name>.rds.
future_iwalk(
  df_training[v_vars_to_discretize_and_dummy],
  .f = function(col_, name) {
    df_to_bin <- Build_Binning_Frame(v_t_default, col_, name)
    
    cuts <- smbinning(df_to_bin, x = name, y = "t_default")
    
    file_name_out <- paste0(path_smbinning, name, ".rds")
    saveRDS(cuts, file = file_name_out)
  },
  .progress = TRUE
)
toc()

# Single-variable spot check: rerun the binning for variable number j on a
# reproducible 10% sample, outside the parallel loop.
j <- 3
stopifnot("Spot-check index j exceeds the number of variables to bin." =
            j <= length(v_vars_to_discretize_and_dummy))

name <- v_vars_to_discretize_and_dummy[j]
col_ <- df_training[[name]]

df_to_bin <- Build_Binning_Frame(v_t_default, col_, name) %>% 
  glimpse()

set.seed(303)
df_t <- df_to_bin %>% 
  sample_frac(0.1)

cuts <- smbinning(df_t, x = name, y = "t_default")


# Smbinning parallel loop ends -------------------------------------------------------

